#encoding=utf8

import re,jieba,sys

def read_file(path):
	"""Read an entire UTF-8 text file and return its contents as a string.

	Args:
		path: filesystem path of the file to read.

	Returns:
		The file's full text content.

	Fix: the original called open() without ever closing the handle,
	leaking the file descriptor until garbage collection; a context
	manager guarantees prompt closure.
	"""
	with open(path, 'r', encoding='utf-8') as f:
		return f.read()

def text_parse(str_txt):
	"""Clean a chunk of Chinese text before word segmentation.

	Removes ideographic (full-width) spaces, all whitespace, and a set of
	ASCII/full-width punctuation plus Latin letters and digits, leaving
	only the CJK content to feed into the tokenizer.

	Args:
		str_txt: raw text to clean.

	Returns:
		The cleaned string.

	Fix: '\\s+' was written as a non-raw string ('\\s' is an invalid
	escape sequence and a SyntaxWarning on modern Python) — now a raw
	string with identical regex behavior. The original's subsequent
	str_txt.replace('\\n', ' ') was a no-op (\\s+ had already removed
	every newline) and is dropped.
	"""
	# Remove ideographic (full-width) spaces.
	str_txt = re.sub('\u3000', '', str_txt)

	# Remove all remaining whitespace (spaces, tabs, newlines).
	str_txt = re.sub(r'\s+', '', str_txt)

	# Remove Latin letters, digits, and common half-/full-width punctuation.
	rule = '[a-zA-Z0-9’!"#$%&\'()*+,-./:：;；|<=>?@，—。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
	str_txt = re.sub(rule, '', str_txt)

	return str_txt

def load_stop_wordlist(path=r'../dictionary/stopWords.txt'):
	"""Load a stop-word file (one word per line) into a set for O(1) lookup.

	Args:
		path: UTF-8 stop-word file; defaults to the project dictionary.

	Returns:
		Set of stop words. Note: a trailing newline in the file yields an
		empty string '' in the set, matching the original split('\\n')
		behavior (harmless for membership tests on real tokens).

	Fix: the original leaked the open file handle; a context manager
	closes it deterministically.
	"""
	with open(path, 'r', encoding='utf-8') as f:
		return set(f.read().split('\n'))

def clear_stop_words(words, stopWords):
	"""Filter a token list in place, dropping unwanted tokens.

	Removes tokens that are stop words, pure digit strings, or a single
	character long. Mutates `words` in place (slice assignment) and also
	returns it, preserving the original's contract for callers that use
	either the return value or the mutated argument.

	Args:
		words: list of tokens (typically from jieba.lcut).
		stopWords: set (or other container) of stop words.

	Returns:
		The same list object, filtered.

	Fix: replaces the reversed-index pop loop (O(n^2) from repeated
	list.pop(i) shifts, plus a direct words.__len__() call) with one
	O(n) comprehension. The original's `words[i] == " "` branch was
	unreachable — a single space already fails the len == 1 test — so
	it is folded away with no behavior change.
	"""
	words[:] = [w for w in words
	            if w not in stopWords and not w.isdigit() and len(w) != 1]
	return words

def clear_for_cut(str_txt):
	"""Clean text line by line, segment it with jieba, and strip stop words.

	Args:
		str_txt: raw multi-line text.

	Returns:
		A flat list of the tokens that survive cleaning and filtering.
	"""
	# Load the stop-word set once, up front.
	stop_words = load_stop_wordlist()

	tokens = []
	for line in str_txt.split('\n'):
		# Clean each line, segment it (precise mode), then filter.
		cleaned = text_parse(line)
		segmented = jieba.lcut(cleaned, cut_all=False)
		tokens.extend(clear_stop_words(segmented, stop_words))

	return tokens

if __name__ == '__main__':
	# Demo: tokenize one sports-category news article and print the tokens.
	sample_path = r'../CSCMNews/体育/0.txt'
	raw_text = read_file(sample_path)
	tokens = clear_for_cut(raw_text)
	print(tokens)